{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from string import letters\n",
    "import numpy as np\n",
    "from pandas import DataFrame\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_svmlight_file\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "from sklearn import preprocessing\n",
    "from sklearn import cross_validation\n",
    "from sklearn import svm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data = np.loadtxt(\"F:\\\\data\\\\train_all_binary.txt\",delimiter=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X = data[:,0:91]\n",
    "y = data[:,92]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(26007L, 91L)"
      ]
     },
     "execution_count": 148,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.,  1.,  1., ...,  0.,  1.,  1.])"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 206,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(26007L, 46L)"
      ]
     },
     "execution_count": 206,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sel = VarianceThreshold(threshold=(.8 * (1 - .8)))\n",
    "X_New=sel.fit_transform(X)\n",
    "X_New.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 207,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn import preprocessing\n",
    "# normalize the data attributes\n",
    "normalized_X = preprocessing.normalize(X_New)\n",
    "# standardize the data attributes\n",
    "standardized_X = preprocessing.scale(X_New)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 262,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(26007L, 3L)"
      ]
     },
     "execution_count": 262,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import ExtraTreesClassifier\n",
    "from sklearn.svm import LinearSVC\n",
    "clf = ExtraTreesClassifier()\n",
    "X_new = clf.fit(X_New, y).transform(X_New)\n",
    "\n",
    "X_new = LinearSVC(C=0.01, penalty=\"l2\", dual=False).fit_transform(X_new, y)\n",
    "X_new.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test = np.loadtxt(\"F:\\\\data\\\\train_all_binary.txt\",delimiter=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 266,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.62579305571576882"
      ]
     },
     "execution_count": 266,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn import preprocessing\n",
    "\n",
    "clf = RandomForestClassifier()\n",
    "clf.fit(X_new, y)\n",
    "from sklearn import metrics\n",
    "predicted = cross_validation.cross_val_predict(clf,X_new,\n",
    "                                          y)\n",
    "metrics.accuracy_score(y, predicted) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 267,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 0.36341307  0.33221042  0.30437651]\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "model = ExtraTreesClassifier()\n",
    "model.fit(X_new, y)\n",
    "# display the relative importance of each attribute\n",
    "print(model.feature_importances_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 261,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0.105, 2), (0.105, 1), (0.104, 0)]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.cross_validation import cross_val_score, ShuffleSplit\n",
    "names = range(3)\n",
    "\n",
    "rf = RandomForestRegressor(n_estimators=20, max_depth=4)\n",
    "scores = []\n",
    "for i in range(X_new.shape[1]):\n",
    "     score = cross_val_score(rf, X_new[:, i:i+1], y, scoring=\"r2\",\n",
    "                              cv=ShuffleSplit(len(X_new), 3, .3))\n",
    "     scores.append((round(np.mean(score), 3), names[i]))\n",
    "print sorted(scores, reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
